library(knitr)
# Suppress warnings and messages in every knitted chunk. FALSE is spelled out
# because the shorthand F is an ordinary variable that can be reassigned.
opts_chunk$set(warning = FALSE, message = FALSE)

Make an example that includes the major components of a plot

library(ggplot2)
# library gridextra to arrange multiple plot
library(gridExtra)
library(grid)
# A single figure that touches every major ggplot2 building block: data, aes
# mapping, geoms, stats, position, facets, scales, guides, annotation, theme.
ggplot(data = mpg, mapping = aes(x = factor(cyl), y = cty)) +
  # --- core content; everything after this section is refinement ---
  geom_point(mapping = aes(color = drv, size = displ), position = "jitter") +
  stat_boxplot(fill = NA) +
  facet_wrap(~year) +

  # --- scales: choose the breaks, labels and limits of axes and legends ---
  scale_x_discrete(
    breaks = c(4, 5, 6, 8),
    labels = c("Four", "Five", "Six", "Eight")
  ) +
  scale_y_continuous(
    breaks = 1:4 * 8,
    limits = c(4, 36)
  ) +
  scale_color_manual(
    values = c("4" = "red", "f" = "blue", "r" = "cyan"),
    breaks = c("4", "f", "r"),
    labels = c("four-wheel", "front-wheel", "rear-wheel")
  ) +
  # NOTE(review): displ values outside these limits are presumably mapped to
  # NA by the scale -- confirm that this is intended.
  scale_size_area(
    max_size = 4,
    limits = c(2, 7)
  ) +

  # --- guides: control how each legend is laid out ---
  guides(
    color = guide_legend(
      order = 1,
      nrow = 3,
      override.aes = list(size = 3)
    ),
    size = guide_legend(
      title.position = "top",
      nrow = 2,
      byrow = TRUE,
      reverse = TRUE,
      override.aes = list(shape = 1)
    )
  ) +

  # --- labs(): every title and label in one place ---
  labs(
    title = "This plot displays all major ggplot components",
    subtitle = "including data, aes mapping, geom, stat, position, facet, scale, guides, annotaion, and theme",
    caption = "Source: what so ever",
    x = "Cylinders",
    y = "City Mileage (miles/gallon)",
    color = NULL,
    size = "displacement"
  ) +

  # --- extra information ---
  # annotate() adds geoms by hand; reach for geom_text() instead when a label
  # should appear in one specific facet panel only
  annotate("text", x = 1:2, y = 8, label = "haha", hjust = 0, vjust = 1) +

  # --- theme(): the non-data appearance ---
  theme(
    plot.background = element_rect(fill = "#F5E6E3"),
    plot.title = element_text(family = "monospace"),
    plot.subtitle = element_text(face = "italic"),
    panel.background = element_rect(fill = "lightblue", color = "red"),
    panel.grid.major.y = element_line(color = "grey95", size = 0.2),
    panel.grid.minor.y = element_blank(),
    axis.title = element_text(family = "monospace"),
    axis.ticks = element_blank(),
    legend.position = "top",
    legend.margin = margin(0, 0, 0, 0),
    legend.key = element_blank(),
    legend.background = element_blank(),
    strip.background = element_blank(),
    strip.text = element_text(size = 12)
  )

legend and axis: labs(), scale_xxx(), and guides()

Legends and axes are the most complex aesthetic system in ggplot. There are many ways to present them, which makes ggplot powerful and, on the downside, confusing. Here we try to standardize the method. scale_xxx() is able to control every component of an axis or legend. For easier manipulation, however, I’d like to break the components into three parts, which are controlled by the following three functions:

  • labs() is for all titles,
  • scale_xxx() determines what to show in axis and legend,
  • guides() determines how to show them.

Inside guides(), each aes is guided by guide_legend() or guide_colorbar(). For more examples, see http://ggplot2.tidyverse.org/reference/guide_legend.html, http://ggplot2.tidyverse.org/reference/guide_colourbar.html, and http://ggplot2.tidyverse.org/reference/guides.html.

# Division of labour for axes and legends: scale_xxx() decides WHAT is shown,
# guides() decides HOW it is shown, and labs() supplies every title.
ggplot(
  data = mpg,
  mapping = aes(x = hwy, y = cty, color = factor(cyl), shape = drv)
) +
  geom_point(mapping = aes(size = displ)) +

  # --- what to display ---
  scale_x_continuous(
    limits = c(0, 50),
    breaks = c(0, 20, 40),
    minor_breaks = c(5, 10, 15),
    labels = c("zero", "twenty", "fourty")
  ) +
  # labels = NULL removes the tick labels
  scale_y_continuous(labels = NULL) +
  # the colors stay at their defaults; only breaks and labels are customised
  scale_color_discrete(
    breaks = c(4, 5, 6, 8),
    labels = c("four", "five", "six", "eight")
  ) +
  # shapes are chosen by hand for the two listed drive trains
  scale_shape_manual(
    values = c(f = 0, r = 2),
    limits = c("f", "r")
  ) +

  # --- how to display ---
  guides(
    # no legend for size
    size = "none",
    color = guide_legend(
      order = 1,                 # first legend to show
      direction = "horizontal",
      title.position = "top",
      nrow = 2,
      byrow = TRUE               # fill row by row; the default is by column
    ),
    shape = guide_legend(
      direction = "vertical",
      reverse = TRUE             # reverse the order of the legend keys
    )
  ) +

  # --- all titles ---
  labs(
    title = "Use labs(), scale_xxx(), guides() for legend and axis",
    subtitle = "scale_xxx for what to display, guides for how to display, labs for titles",
    x = "highway mileage",
    y = "city mileage",
    color = "cylinder",
    shape = "drive train",
    size = "haha"
  )

aes mapping: very flexible

# Left panel: aesthetics taken from data-frame columns, from free-standing
# vectors with one value per row, and from constants -- all in one aes().
g1 <- ggplot(data = mpg) +
  geom_jitter(
    mapping = aes(
      x = class,               # column mpg$class
      y = rnorm(nrow(mpg)),    # vector built on the fly, one value per row
      size = 1:nrow(mpg),      # another vector with one value per row
      color = drv,             # column mpg$drv
      alpha = 0.5              # a constant
    )
  ) +
  labs(subtitle = "as long as the same length vector or constant")

# Right panel: no data frame at all, every aesthetic is a plain vector.
g2 <- ggplot() +
  geom_point(
    mapping = aes(
      x = 1:3,
      y = 3:1,
      color = c("red", "blue", "green"),
      shape = letters[1:3]
    ),
    size = 5
  ) +
  # use the color names as-is and still draw a legend for them
  scale_color_identity(guide = "legend") +
  labs(subtitle = "can even plot from pure vectors but better reserve for simple plot")

# Both panels side by side under a shared heading.
grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob(
    "aes are very flexible",
    gp = gpar(fontsize = 16)
  )
)

aes mapping: control same aes in different layers

# Control one aesthetic across several layers: each layer maps color to a
# constant label, and a single scale_color_manual() assigns the real color
# for each label, producing one combined legend.
ggplot(mpg, aes(hwy, cty)) +
    geom_point() +
    stat_smooth(method = "lm", aes(color = "lm")) +
    # `method` must be spelled correctly; a misspelled argument is not applied
    # as the smoothing method
    stat_smooth(method = "loess", aes(color = "loess")) +
    # horizontal reference line at y = 10, labelled "flat" in the legend
    geom_line(aes(hwy, 10, color = "flat")) +
    scale_color_manual(values = c(lm = "red", loess = "blue", flat = "orange")) +
    labs(title = "control same aes in different layers", color = "smooth")

aes mapping: pay attention to group aes

Aesthetics such as color and shape automatically divide the data into groups, for individual geoms as well as collective geoms. The group aesthetic overrides these default groups for collective geoms. The group aesthetic is only used to group data; it does not produce a legend.

# The group aesthetic overrides the default grouping derived from other
# aesthetics. z1 cycles a, b, c along x; z2 splits x into three consecutive
# blocks of three points.
df <- data.frame(x = 1:9, y = c(1, 3, 2, 5, 3, 6, 5, 2, 6),
                 z1 = c("a", "b", "c"), z2 = rep(c("A", "B", "C"), each = 3))

# Left: no group aesthetic, so the line layer is grouped by color (z1).
g1 <- ggplot(df, aes(x, y, color = z1)) +
    geom_point(size = 3) +
    geom_line() +
    annotate("text", x = 1, y = 7, label = "geom_line(aes(color = z1))",
             hjust = "inward", vjust = "inward") +
    labs(subtitle = "without group aes, color is the default group for line plot\n")

# Right: group = z2 on the line layer replaces the color-derived grouping.
# The subtitle describes this panel (it used to repeat the left panel's text).
g2 <- ggplot(df, aes(x, y, color = z1)) +
    geom_point(size = 3) +
    geom_line(aes(group = z2)) +
    annotate("text", x = 1, y = 7, label = "geom_line(aes(color = z1, group = z2))",
             hjust = "inward", vjust = "inward") +
    labs(subtitle = "with group aes, z2 overrides color as the group for line plot\n")


grid.arrange(g1, g2, nrow = 1,
             top = textGrob("group aes sets groups for collective geoms and overrides default group from other aes",
                            gp = gpar(fontsize = 16)))

aes mapping: force a constant group for a line plot when each group has only one observation

# Four points, each with its own color. The color mapping would leave the
# line layer with one observation per group, so a constant group is forced.
df <- data.frame(
  x = 1:4,
  y = 1:4,
  color = letters[1:4]
)
ggplot(data = df, mapping = aes(x = x, y = y, color = color)) +
  # any constant (number or string) works as the group value
  geom_line(mapping = aes(group = 1111)) +
  geom_point(size = 5) +
  labs(
    title = "force a constant group",
    subtitle = "color aes generate a default group that has only one observation for each group,\nwhich is not enough for line plot. force constant group for line plot"
  )

stats: generated (or computed) variables

Use ?stat_xxx to find out what variables the stat computes, which can then be used in aes() for plotting.

# Plot a variable computed by the stat (density) rather than a data column.
ggplot(data = mpg, mapping = aes(x = hwy)) +
  stat_bin(
    mapping = aes(y = ..density..),
    geom = "line",
    bins = 10
  ) +
  labs(title = "..density.. for density distribution")

stats: count 1 categorical variable

# The two plots below are identical: one is written with stat_xxx() calls
# that take a geom, the other with geom_xxx() calls that take a stat.
g1 <- ggplot(data = mpg, mapping = aes(x = drv)) +
  # "bar" is the default geom of stat_count()
  stat_count(geom = "bar") +
  stat_count(geom = "point", color = "red", size = 5) +
  stat_count(mapping = aes(group = 1), geom = "line", color = "blue", size = 1) +
  labs(subtitle = "stat_xxx function where geom is a argument")

g2 <- ggplot(data = mpg, mapping = aes(x = drv)) +
  geom_bar(stat = "count") +
  geom_point(stat = "count", color = "red", size = 5) +
  geom_line(mapping = aes(group = 1), stat = "count", color = "blue", size = 1) +
  labs(subtitle = "geom_xxx function where stat is a argument")

grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob(
    "geom_xxx and stat_xxx functions generate the same plot",
    gp = gpar(fontsize = 16)
  )
)

stats: count 1 continuous variable

Focus on stat_xxx functions.

# Bin one continuous variable and draw the counts with three different geoms.
ggplot(data = mpg, mapping = aes(x = hwy)) +
  # the default geom; same result as geom_histogram()
  stat_bin(geom = "bar") +
  # same result as geom_freqpoly()
  stat_bin(geom = "line", color = "blue") +
  stat_bin(geom = "point", color = "red", size = 5) +
  labs(title = "count one contineous variable")

stats: count 2 continuous variables

# Count observations in 2D bins of two continuous variables, once with
# rectangular tiles and once with hexagons.
df <- data.frame(x = rnorm(1000), y = rnorm(1000))
g1 <- ggplot(df, aes(x, y)) +
    stat_bin_2d(geom = "tile", bins = 30) +    # default geom good
    # the subtitle names the geom actually used by this layer ("tile")
    labs(subtitle = "geom is tile")
g2 <- ggplot(df, aes(x, y)) +
    stat_bin_hex(geom = "hex", bins = 30) +    # default geom good
    labs(subtitle = "geom is hex")
grid.arrange(g1, g2, nrow = 1,
             top = textGrob("count two contineous variables",
                            gp = gpar(fontsize = 16)))

stats: 1D summary statistics

# NOTE(review): fun.y / fun.ymin / fun.ymax are presumably the older argument
# names of stat_summary(); confirm against the installed ggplot2 version.

# Left panel: summaries of y at each level of a discrete x.
g1 <- ggplot(data = mpg, mapping = aes(x = drv, y = hwy)) +
  geom_point(color = "grey70", size = 3) +
  # a built-in summary function
  stat_summary(geom = "tile", fun.y = max, fill = "red", alpha = 0.1) +
  # a user-defined summary function
  stat_summary(
    geom = "point",
    fun.y = function(s) mean(s + 5),
    color = "green", size = 3
  ) +
  # a range from mean - sd to mean + sd; "pointrange" is the default geom
  stat_summary(
    geom = "pointrange",
    fun.y = mean,
    fun.ymin = function(s) mean(s) - sd(s),
    fun.ymax = function(s) mean(s) + sd(s),
    color = "red"
  ) +
  labs(subtitle = "discrete x and contineous y")

# Right panel: the same summaries, with a continuous x cut into 10 bins.
g2 <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point(color = "grey70", size = 3) +
  stat_summary_bin(
    geom = "tile",
    fun.y = max,
    bins = 10,
    fill = "red", alpha = 0.3
  ) +
  # a user-defined summary function
  stat_summary_bin(
    geom = "point",
    fun.y = function(s) mean(s + 5),
    bins = 10,
    color = "green", size = 3
  ) +
  # a range from mean - sd to mean + sd; "pointrange" is the default geom
  stat_summary_bin(
    geom = "pointrange",
    fun.y = mean,
    fun.ymin = function(s) mean(s) - sd(s),
    fun.ymax = function(s) mean(s) + sd(s),
    bins = 10,
    color = "red"
  ) +
  labs(subtitle = "contineous x and contineous y")

grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob(
    "1D summary statistics -- flexible and powerful",
    gp = gpar(fontsize = 16)
  )
)

stats: 2D summary statistics

# Random x, y and z values; z is summarised inside each 2D bin of (x, y).
df <- data.frame(
  x = rnorm(10000),
  y = rnorm(10000),
  z = rnorm(10000)
)

# minimum of z in every rectangular bin
g1 <- ggplot(data = df, mapping = aes(x = x, y = y, z = z)) +
  stat_summary_2d(fun = min, bins = 20, na.rm = TRUE) +
  labs(subtitle = "plot min of z in each 2-D bin")

# maximum of z in every hexagonal bin
g2 <- ggplot(data = df, mapping = aes(x = x, y = y, z = z)) +
  stat_summary_hex(fun = max, bins = 20, na.rm = TRUE) +
  labs(subtitle = "plot max of z in each 2D hex")

grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob("plot 2D count and summury statistics")
)

stats: apply weight in statistics

# Compare an unweighted linear fit (red) with one weighted by population.
ggplot(data = midwest, mapping = aes(x = percwhite, y = percbelowpoverty)) +
  geom_point(mapping = aes(size = poptotal / 1e6)) +
  # unweighted fit
  stat_smooth(method = lm, color = "red", fill = "lightgreen", size = 1) +
  # weight is supplied as an aesthetic
  stat_smooth(mapping = aes(weight = poptotal), method = lm, size = 1) +
  scale_size_area(guide = "none") +
  labs(
    title = "Apply weight in geom_smooth",
    subtitle = "wieght is an aes"
  )

positions: arrange overlapping graphical objects

# The same counted bar chart drawn with five different position adjustments.

# "stack" is the default: bars sit on top of each other
g1 <- ggplot(data = mpg, mapping = aes(x = drv, fill = factor(year))) +
  stat_count(position = "stack") +
  labs(subtitle = 'position = "stack", on top of each other')

# "dodge": bars sit side by side
g2 <- ggplot(data = mpg, mapping = aes(x = drv, fill = factor(year))) +
  stat_count(position = "dodge") +
  labs(subtitle = 'position = "dodge", side by side')

# "identity": every bar starts at zero, so the bars overlap
g3 <- ggplot(data = mpg, mapping = aes(x = drv, fill = factor(year))) +
  stat_count(position = "identity", alpha = 0.5) +
  labs(subtitle = 'position = "identity", all start from zero')

# "fill": bars are stacked and rescaled to add up to 1 (100%)
g4 <- ggplot(data = mpg, mapping = aes(x = drv, fill = factor(year))) +
  stat_count(position = "fill") +
  labs(subtitle = 'position = "fill", add up to 100%')

# "jitter": shown for completeness, not useful for a bar plot
g5 <- ggplot(data = mpg, mapping = aes(x = drv, fill = factor(year))) +
  stat_count(position = "jitter", alpha = 0.5) +
  labs(subtitle = 'position = "jitter", not useful for bar plot')

grid.arrange(
  g1, g2, g3, g4, g5,
  nrow = 2,
  top = textGrob(
    "Five position choices",
    gp = gpar(fontsize = 16)
  )
)

positions: position_jitterdodge() function

position_jitterdodge() dodges points within groups and then adds a small amount of noise. It is often used to overlay the data points on geom_boxplot().

# Boxplots per drive train and year, with the raw points dodged to match the
# boxes and jittered.
ggplot(
  data = mpg,
  mapping = aes(x = drv, y = hwy, color = factor(year))
) +
  geom_boxplot() +
  geom_point(position = position_jitterdodge()) +
  labs(title = "position_jitterdodge() dodges jittered points within groups")

positions: position_nudge() function

May be useful for comparing plots side by side in different layers.

# Grey points at their true positions, black copies shifted by a fixed offset.
ggplot(data = mpg, mapping = aes(x = drv, y = hwy)) +
  geom_point(color = "grey") +
  geom_point(
    position = position_nudge(x = 0.1, y = 1)
  ) +
  labs(title = "position_nudge() shifts the whole plot")

positions: geom_col for bar plot

Use geom_col() when both x and y are provided. For position = “dodge”, the fill aesthetic must be categorical.

library(ggplot2)
# Small summary table; z is numeric, so fill = z is a continuous fill and
# fill = factor(z) is a categorical one.
df <- data.frame(
  x = c("A", "A", "B", "B", "B", "C"),
  y = 1:6,
  z = 1:6
)

# --- continuous fill ---
g1 <- ggplot(data = df, mapping = aes(x = x, y = y, fill = z)) +
  geom_col(position = "dodge") +
  labs(subtitle = "contineous fill, dodge not working")

g2 <- ggplot(data = df, mapping = aes(x = x, y = y, fill = z)) +
  geom_col(position = "stack") +
  labs(subtitle = "contineous fill, stack is ok")

g3 <- ggplot(data = df, mapping = aes(x = x, y = y, fill = z)) +
  geom_col(position = "fill") +
  labs(subtitle = "contineous fill, fill is ok")

# --- categorical fill ---
g4 <- ggplot(data = df, mapping = aes(x = x, y = y, fill = factor(z))) +
  geom_col(position = "dodge") +
  labs(subtitle = "categorical fill, dodge is ok")

g5 <- ggplot(data = df, mapping = aes(x = x, y = y, fill = factor(z))) +
  geom_col(position = "stack") +
  labs(subtitle = "categorical fill, stack is ok")

g6 <- ggplot(data = df, mapping = aes(x = x, y = y, fill = factor(z))) +
  geom_col(position = "fill") +
  labs(subtitle = "categorical fill, fill is ok")

grid.arrange(
  g1, g2, g3, g4, g5, g6,
  nrow = 2,
  top = textGrob("Dodge does not work with contineous or no fill")
)

positions: dodge for text

library(ggplot2)
# Dodged bars with one text label per bar.
df <- data.frame(
  x = c("A", "A", "B", "B", "B", "C"),
  y = 1:6,
  z = letters[1:6]
)

# plain position = "dodge" on the text layer
g1 <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_col(mapping = aes(fill = z), position = "dodge") +
  geom_text(
    mapping = aes(label = z, color = z),
    position = "dodge",
    vjust = -0.2
  ) +
  labs(subtitle = 'position = "dodge" does not align text correctly')

# position_dodge() with an explicit width; the width needs fine tuning
g2 <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_col(mapping = aes(fill = z), position = "dodge") +
  geom_text(
    mapping = aes(label = z, color = z),
    position = position_dodge(width = 0.9),
    vjust = -0.2
  ) +
  labs(subtitle = 'position = position_dodge(width = 0.9) works')

grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob("Use position_dodge() to dodge text")
)

geoms: geom_segment to draw a vector field

# One arrow per row of `seals`, from (lat, long) to the position displaced by
# (delta_lat, delta_long), with a red dot at each starting point.
ggplot(data = seals, mapping = aes(x = lat, y = long)) +
  geom_point(color = "red", size = 0.5) +
  geom_segment(
    mapping = aes(
      x = lat,
      y = long,
      xend = lat + delta_lat,
      yend = long + delta_long
    ),
    arrow = arrow(angle = 20, length = unit(1, "mm")),
    color = "blue"
  ) +
  labs(
    title = "Draw a vector field with geom_segment()",
    subtitle = "arrows are drawn with arrow() funtion with specified angle and length"
  )

geoms: geom_rect to draw presidential term

# Shade every presidential term with geom_rect(), label it with the
# president's name, and overlay unemploy / pop from the `economics` data.
ggplot(data = presidential) +
  # ymin = -Inf and ymax = Inf make each rectangle cover the full data range
  geom_rect(
    mapping = aes(
      xmin = start, xmax = end,
      ymin = -Inf, ymax = Inf,
      fill = party
    ),
    alpha = 0.2
  ) +
  geom_text(
    mapping = aes(x = start, y = 0.0001, label = name),
    angle = 90, hjust = 0, vjust = 1
  ) +
  # unemployment layer, drawn on top of the rectangles and the names
  geom_line(data = economics, mapping = aes(x = date, y = unemploy / pop)) +
  # one break at every term boundary
  scale_x_date(
    breaks = unique(c(presidential$start, presidential$end)),
    expand = c(0, 0)
  ) +
  # alternative with evenly spaced dates:
  # breaks = seq(min(presidential$start), max(presidential$end), "8 years")) +
  scale_y_continuous(
    labels = scales::percent_format(),
    expand = c(0, 0)
  ) +
  scale_fill_manual(values = c("Democratic" = "blue", "Republican" = "red")) +
  labs(
    title = "Draw time blocks with geom_rect()",
    subtitle = "scale ymin and ymax with -Inf and Inf to match other data",
    y = "Unemployed population / total population"
  ) +
  theme_bw() +
  theme(
    panel.grid.major.x = element_line(color = "grey90", size = 0.2),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = -30, hjust = 0, vjust = 1)
  )

geoms: geom_boxplot that many people use

In a boxplot, the box spans the 25th to the 75th percentile with the median (50th percentile) drawn inside, and the whiskers by default extend up to 1.5 times the box height above and below the box. Outliers are the points beyond the whiskers and are plotted individually. For a normal distribution, less than 1% of the data points are outliers.

# 4000 standard-normal draws spread over four groups (the group letters are
# recycled), shown as jittered points under a transparent boxplot.
set.seed(123)
df <- data.frame(
  x = letters[1:4],
  y = rnorm(4000)
)
ggplot(data = df, mapping = aes(x = x, y = y)) +
  # jitter horizontally only, so the y values stay exact
  geom_jitter(width = 0.3, height = 0, color = "red", alpha = 0.3) +
  geom_boxplot(fill = NA, outlier.size = 3, outlier.alpha = 0.3) +
  labs(
    title = "Boxplot: explain the five summary statistics and outliers",
    subtitle = "Less than 1% are outliers in normal distribution, as seen in the plots",
    x = NULL,
    y = NULL
  )

geoms: geom_bar causes confusion

# Default use of geom_bar(): count the rows of one categorical variable.
ggplot(data = mpg, mapping = aes(x = manufacturer, fill = drv)) +
  # "stack" is the default position
  geom_bar(position = "stack") +
  labs(
    title = "bar plot of the count of a categorical variable, aes(catergorial_variable)",
    subtitle = "plotted from raw data before summary statistics"
  )

# geom_bar() can also draw data that are already summarised: bar heights are
# taken directly from y, so stat = "identity" switches the counting off.
df <- data.frame(
  x = letters[1:4],
  y = c(2, 1, 5, 3)
)
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_bar(stat = "identity") +
  labs(
    title = "bar plot from two variables, aes(x, y)",
    subtitle = "after summary statistics. no statistics in the plot so set stat = 'identity'"
  )

geoms: geom_area plots ribbons

# Toy data for the area plot: three groups (A, C, B) with four x positions
# each.
df <- data.frame(
    x = rep(1:4, 3),
    y = c(4, 3, 1, 6, 1, 2, 1, 3, 1, 1, 1, 1),
    group = rep(c("A", "C", "B"), each = 4)
)

# geom_area() plots the first level at the top by default. To start from the
# bottom, reverse the levels.
# `group` may be a plain character column (stringsAsFactors = FALSE), in which
# case levels(df$group) is NULL and every value would become NA. Taking the
# levels from factor(df$group) works for character and factor columns alike.
df$group <- factor(df$group, levels = rev(levels(factor(df$group))))

# Stacked area plot with an outline around every ribbon.
ggplot(data = df, mapping = aes(x = x, y = y, fill = group)) +
  # color and size style the ribbon boundaries
  geom_area(size = 0.5, color = "grey50") +
  labs(
    title = "Plot ribbons with geom_area()",
    subtitle = "by default, the first level is top which need to be reversed for better view"
  )

geoms: geom_text place text at corner with inward hjust and vjust

“inward” automatically gives hjust and vjust a value of 0 or 1 according to the (x, y) position. For example, if (x, y) is at the top right relative to the center, then hjust = 1 and vjust = 1.

# Three points -- two at opposite corners, one near the middle -- to show how
# "inward" justification places their labels.
df <- data.frame(
  x = c(1, 1.9, 3),
  y = c(1, 1.9, 3)
)
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point(color = "red") +
  # geom_text() positions the labels from a data frame
  geom_text(
    mapping = aes(x, y),
    data = data.frame(x = c(1, 1.9, 3), y = c(1, 1.9, 3)),
    label = c(
      "at corner inward make\nposition inward",
      "near middle the text \nstay towards center\nunless exact center",
      "no matter what corner\ninward is inward"
    ),
    hjust = "inward",
    vjust = "inward"
  ) +
  # annotate("text", ...) adds free-standing text
  annotate(
    "text",
    x = 1, y = 3,
    label = "inward works for labels created with\nannotate",
    hjust = "inward", vjust = "inward",
    color = "red"
  ) +
  labs(
    title = 'Add text with geom_text or annotate("text", ...)',
    subtitle = '"inward" works best near corners'
  )

geoms: geom_contour and geom_raster for surface plot

NO true 3D plot in ggplot.

# Surface plots need x, y and z; here z is the density column of faithfuld.
# Left: contour lines colored by their level.
g1 <- ggplot(data = faithfuld, mapping = aes(x = eruptions, y = waiting)) +
  geom_contour(mapping = aes(z = density, colour = ..level..)) +
  labs(subtitle = "contour plot")

# Right: the same surface as a filled raster.
g2 <- ggplot(data = faithfuld, mapping = aes(x = eruptions, y = waiting)) +
  geom_raster(mapping = aes(fill = density)) +
  labs(subtitle = "raster plot")

grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob(
    "ggplot supports surface plots but not true 3D plot",
    gp = gpar(fontsize = 16)
  )
)

geoms: geom_errorbar to plot error

# Means (y) with standard errors (se); ymin and ymax are derived in aes() and
# shared by every layer below.
df <- data.frame(
  x = 1:3,
  y = c(18, 11, 16),
  se = c(1.2, 1.5, 1.0)
)
ggplot(
  data = df,
  mapping = aes(x = x, y = y, ymin = y - se, ymax = y + se)
) +
  # ribbon and line through the given values, suitable for a continuous x
  geom_smooth(stat = "identity") +
  # classic error bars
  geom_errorbar(size = 6, width = 0.2) +
  # alternatives: point range and line range
  geom_pointrange(color = "red", size = 3) +
  geom_linerange(color = "blue", linetype = "solid", size = 1) +
  # alternative: a box around each interval
  geom_crossbar(color = "green", width = 0.3) +
  labs(
    title = "Plot errors when knowing means and errors",
    subtitle = "aes takes ymin and ymax. There are many way to make error plots."
  )

geoms: many ways to plot distribution

# Four views of the distribution of diamond depth, split by cut and limited
# to the x range 58-68.

# histogram with the default binning
g1 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_histogram(mapping = aes(fill = cut)) +
  xlim(58, 68) +
  labs(subtitle = "traditional histogram")

# histogram with position = "fill" and 0.1-wide bins
g2 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_histogram(
    mapping = aes(fill = cut),
    position = "fill",
    binwidth = 0.1,
    na.rm = TRUE
  ) +
  xlim(58, 68) +
  labs(subtitle = "stacked histogram")

# frequency polygon; fill does not apply to geom_freqpoly(), so use color
g3 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_freqpoly(
    mapping = aes(color = cut),
    binwidth = 0.1,
    na.rm = TRUE
  ) +
  xlim(58, 68) +
  labs(subtitle = "line plot of histogram")

# kernel density estimate per cut
g4 <- ggplot(data = diamonds, mapping = aes(x = depth)) +
  geom_density(
    mapping = aes(fill = cut, color = cut),
    alpha = 0.2,
    na.rm = TRUE
  ) +
  xlim(58, 68) +
  labs(subtitle = "density distribution")

grid.arrange(
  g1, g2, g3, g4,
  nrow = 2,
  top = textGrob(
    "Various way to plot distribution",
    gp = gpar(fontsize = 16)
  )
)

geoms: binned scatter plot

# Dense random scatter, drawn as binned counts instead of single points.
df <- data.frame(
  x = rnorm(2000),
  y = rnorm(2000)
)
# shared base plot without axis titles
norm <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  xlab(NULL) +
  ylab(NULL)

# rectangular bins
g1 <- norm +
  geom_bin2d(bins = 30) +
  labs(subtitle = "geom_bin2d() draw rectangles")

# hexagonal bins with a thin outline
g2 <- norm +
  geom_hex(bins = 30, size = 0.1, color = "grey30") +
  labs(subtitle = "geom_hex() draw hex, considered better generally")

grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob("turn dense scattered plot into surface density plot")
)

scale: transform axis not data

# Apply a transformation on each scale (x, y and color) and leave the data
# itself untouched.
df <- data.frame(
  x = c(0.1, 1, 1000),
  y = 1:3,
  z = c(0.1, 2, 2000)
)
ggplot(data = df, mapping = aes(x = x, y = y, color = z)) +
  geom_point() +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "reciprocal") +
  scale_color_continuous(trans = "sqrt") +
  labs(
    title = "Transform scale not data",
    subtitle = "there are more to offer than just log10"
  )

scale: override aes in legend

# Legend keys also pick up fixed (non-aes) properties such as size, shape and
# alpha set outside aes(). guide_legend(override.aes = ...) replaces them for
# the legend only.
df <- data.frame(
  x = c(20, 30, 40),
  y = c(30, 20, 15),
  z = c("aa", "bb", "cc")
)
p <- ggplot(data = df) +
  geom_point(mapping = aes(x = x, y = y, color = z), size = 10, alpha = 0.3)

# default legend: keys are as large and as faint as the points
g1 <- p +
  labs(subtitle = "default aes: too big and too dim")

# overridden legend: smaller, opaque, and even a different shape
g2 <- p +
  scale_color_discrete(
    guide = guide_legend(
      override.aes = list(size = 3, alpha = 1, shape = 15)
    )
  ) +
  labs(subtitle = "override aes: all applicable aes can be overridden")

grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob(
    "Override aes in legend if default is not desirable",
    gp = gpar(fontsize = 16)
  )
)

scale: add special format in scale labels

# Format scale labels with the scales package, or by hand. The same result
# can also be reached manually, for example by tuning each label.
ggplot(mpg, aes(hwy, cty, color = drv)) +
    geom_point() +
    # use the scales package to get special formats
    scale_x_continuous(limits = c(10, 50),
                       breaks = c(10, 30, 50),
                       labels = scales::percent_format()) +
    # The prefix is passed by name: the position of `prefix` in
    # dollar_format() differs between scales versions, so a positional "$"
    # can land on a different parameter. "$" can be replaced by any string.
    scale_y_continuous(labels = scales::dollar_format(prefix = "$")) +
    # or manually, by building the labels with paste0()
    scale_color_discrete(breaks = c("f", "4", "r"),
                         labels = paste0("&_", c("f", "4", "r"), "_&")) +
    labs(title = "Add special format in scales such as $ and %",
         subtitle = 'packages "scaels" get many of these work done but can also be achieved manually\n for example in the legend below')

scale: draw minor breaks for log10 scale

# The outer product %o% of 1:10 and the powers of ten 10^(0:4) gives the
# minor break positions for the log10 axis.
df <- data.frame(
  x = c(1, 50, 300, 5000),
  y = 1:4
)
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  scale_x_continuous(
    minor_breaks = (1:10) %o% 10^(0:4),
    trans = "log10"
  ) +
  labs(
    title = "Draw minor ticks for log10 scale",
    subtitle = "great application of %o% operator"
  )

scale: control empty space surround plot data

# US state outlines, with the padding around the data controlled per axis.
ggplot(
  data = map_data("state"),
  mapping = aes(x = long, y = lat, group = group)
) +
  geom_path() +
  # x-axis: no empty space beyond the data (compare with the y-axis)
  scale_x_continuous(expand = c(0, 0)) +
  # y-axis: pad by 0.1 times the data range, plus a constant 1
  scale_y_continuous(expand = c(0.1, 1)) +
  coord_map() +
  theme(
    panel.background = element_rect(fill = NA, color = "red"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  ) +
  labs(
    title = "Remove empty space arond a plot",
    subtitle = "by default, extra space is added to the limit of data. it can be removed"
  )

scale: match axis and legend of two plots

# Two plots of different subsets get the same x-axis and color legend because
# both use identical scale limits.
fwd <- subset(mpg, drv == "f")
rwd <- subset(mpg, drv == "r")
# every vehicle class in the full data set; used as the color-scale limits
class <- unique(mpg$class)

g1 <- ggplot(data = fwd, mapping = aes(x = displ, y = hwy, color = class)) +
  geom_point() +
  scale_x_continuous(limits = c(1, 7)) +
  # for a discrete scale, limits is a vector of categorical values
  scale_color_discrete(limits = class) +
  labs(
    title = "First plot",
    subtitle = 'show only for drv == "f"'
  )

g2 <- ggplot(data = rwd, mapping = aes(x = displ, y = hwy, color = class)) +
  geom_point() +
  scale_x_continuous(limits = c(1, 7)) +
  scale_color_discrete(limits = class) +
  labs(
    title = "Second plot",
    subtitle = 'show only for drv == "r"'
  )

# show the two plots side by side
grid.arrange(
  g1, g2,
  nrow = 1,
  top = textGrob(
    "Forced match of axis and legend for two ggplots that would be different by default\n",
    gp = gpar(fontsize = 16)
  )
)

scale: display colors for missing values

# Missing values on a continuous fill scale are drawn in grey by default;
# the `na.value` argument of the scale changes that color.
df <- data.frame(x = 1, y = 1:5, z = c(1, 3, 2, NA, 5))
# No `size` argument here: the tiles are drawn without a border color, so a
# border width has no visible effect, and ggplot2 >= 3.4.0 emits a
# deprecation warning when `size` is used for line widths.
p <- ggplot(df, aes(x, y)) + geom_tile(aes(fill = z))

# default behavior
g1 <- p + labs(subtitle = "NA is grey by default")
# na.value = NA: no color at all, i.e. the NA tile is transparent
g2 <- p + scale_fill_gradient(na.value = NA) +
    labs(subtitle = "set NA to be transparent")
# any color name works as na.value
g3 <- p + scale_fill_gradient(na.value = "red") +
    labs(subtitle = "set NA to be red")

grid.arrange(g1, g2, g3, nrow = 1,
             top = textGrob("Display colors for missing values",
                            gp = gpar(fontsize = 16)))

scale: use color brewer

# ColorBrewer palettes are predefined; users cannot define their own palette
# through scale_color_brewer(). Thirteen levels are plotted on purpose, which
# is more than any of the palettes below provides.
n <- 13
df <- data.frame(x = seq_len(n), y = seq_len(n), z = letters[seq_len(n)])
# a grey point sits under every colored point, so levels that receive no
# palette color remain visible
g <- ggplot(df, aes(x, y)) +
    geom_point(color = "grey90", size = 6) +
    geom_point(aes(color = z), size = 6)

# Set1: the most distinguishable colors, at most 9 of them. The `type`
# argument makes no difference once a palette name is given:
# g + scale_color_brewer(type = "div" , palette = "Set1")
# g + scale_color_brewer(type = "qual", palette = "Set1")
g1 <- g +
    scale_color_brewer(palette = "Set1") +
    labs(subtitle = "palette Set1 only allows 9 colors")

# Greens: shades of a single color, at most 9 of them
g2 <- g +
    scale_color_brewer(palette = "Greens") +
    labs(subtitle = "shades of one color, 9 shades max")

# PuOr: shades running between two colors at the limits, at most 11 of them
g3 <- g +
    scale_color_brewer(palette = "PuOr") +
    labs(subtitle = "palatte PuOr allows 11 colors")

grid.arrange(
    g1, g2, g3,
    nrow = 1,
    top = textGrob(
        "Color brewer allows limited number of colors in a palatte",
        gp = gpar(fontsize = 16)
    )
)

scale: use user-hand-picked colors

# `values` takes a vector of colors. When the vector is named, the names are
# the levels of the mapped variable (here, the levels of z).

# the nine colors of the ColorBrewer palette Set1, written out by hand
set1 <- c(
    "#E41A1C", "#377EB8", "#4DAF4A",
    "#984EA3", "#FF7F00", "#FFFF33",
    "#A65628", "#F781BF", "#999999"
)
# repeat the nine colors ten times so there are enough for every level
color_palette <- rep(set1, times = 10)

g +
    scale_color_manual(values = color_palette) +
    labs(
        title = "Use hand-picked colors",
        subtitle = "can be any colors of choice"
    )

scale: set limit to highlight selected data

# Limits can be set on continuous as well as on categorical scales.
ggplot(mpg, aes(drv, hwy, color = class)) +
    geom_jitter(width = 0.2) +
    # categorical axis: keep only "f" and "r" (helper: xlim("f", "r"))
    scale_x_discrete(limits = c("f", "r")) +
    # continuous axis: upper limit of 30, lower limit left open
    # (helper: ylim(NA, 30))
    scale_y_continuous(limits = c(NA, 30)) +
    # color has no helper function. Classes outside the limits are drawn
    # with na.value, which is a handy way to highlight the selected ones.
    scale_color_discrete(
        limits = c("2seater", "midsize"),
        na.value = "grey80"
    ) +
    labs(
        title = "set limit can selectively display data",
        subtitle = "especially when set limits for discrete colors as not-selected are in grey"
    )

scale: Date needs special treatment

# Shared base plot of the personal savings rate over time; each variant
# below only changes the date scale on the x-axis.
base <- ggplot(economics, aes(date, psavert)) +
    geom_line(na.rm = TRUE) +
    labs(x = NULL, y = NULL)

# 1. the default date axis
g1 <- base + labs(subtitle = "default date")

# 2. a break every five years, labelled with two-digit years (95, 00, 05 ...)
g2 <- base +
    scale_x_date(date_breaks = "5 years", date_labels = "%y") +
    labs(subtitle = "every five years")

# 3. one year of data with monthly minor breaks, labels as month + year
g3 <- base +
    scale_x_date(
        limits = as.Date(c("2004-01-01", "2005-01-01")),
        date_minor_breaks = "1 month",
        date_labels = "%b %y"
    ) +
    labs(subtitle = "every month")

# 4. four months of data: major breaks every four weeks, minor every week
g4 <- base +
    scale_x_date(
        limits = as.Date(c("2004-01-01", "2004-05-01")),
        date_breaks = "4 weeks",
        date_minor_breaks = "1 weeks",
        date_labels = "%m/%d/%Y"
    ) +
    labs(subtitle = "every four weeks")

grid.arrange(
    g1, g2, g3, g4,
    nrow = 2,
    top = textGrob(
        "Date scale needs spacial treatment",
        gp = gpar(fontsize = 16)
    )
)

facet: facet_wrap() scale control

# facet_wrap(): control panel arrangement, per-panel scales and strip placement
ggplot(mpg, aes(hwy, cty)) +
    geom_point() +
    # dir = "v" fills the panels column-by-column, dir = "h" row-by-row.
    # scales = "free", "free_x" or "free_y" lets each panel have its own
    # x and/or y scale. The argument is spelled out in full (`scales`, not
    # `scale`) so the call does not rely on partial argument matching.
    # strip.position controls on which side of the panel the strip is shown.
    facet_wrap(~ class, nrow = 3, dir = "v", scales = "free_y",
               strip.position = "left") +
    labs(title = "Control scale in facet_wrap and arrange panels")

facet: facet_grid() scale and space control

# facet_grid(): one row of panels per manufacturer, listing its models on y
ggplot(mpg, aes(cty, model)) +
    geom_point() +
    # With free scales, space = "free" additionally sizes each panel
    # according to the extent of its own scale. The argument is spelled out
    # in full (`scales`, not `scale`) so the call does not rely on partial
    # argument matching.
    facet_grid(manufacturer ~ ., scales = "free_y", space = "free") +
    # horizontal strip labels are easier to read than the rotated default
    theme(strip.text.y = element_text(angle = 0)) +
    labs(title = "facet_grid has one more control of display, the space")

facet: add new layer to selected panel

# 300 random points; z recycles "a", "b", "c" and is the facetting variable
df <- data.frame(x = rnorm(300), y = rnorm(300), z = letters[1:3])
ggplot(df, aes(x, y)) +
    geom_point(alpha = 0.5) +
    facet_wrap(~z) +
    # the layer data has no z column, so the red line is drawn in all panels
    geom_line(
        data = data.frame(x = -1:1, y = -1:1),
        size = 3, color = "red"
    ) +
    # the layer data has z == "b", so the blue line is drawn in panel b only
    geom_line(
        data = data.frame(x = 1:-1, y = -1:1, z = rep("b", 3)),
        size = 3, color = "blue"
    ) +
    labs(
        title = "Facet: add new layers to selected panels",
        subtitle = "use the variable for facet"
    )

labs: math in title and annotation

# Titles, axis labels and legend titles accept plotmath expressions.
ggplot(mpg, aes(hwy, cty, color = drv)) +
    geom_point() +
    # a text annotation is parsed as a math expression only if parse = TRUE
    annotate(
        "text",
        x = 20, y = 30,
        label = "annotate~label~x%+-%y",
        parse = TRUE
    ) +
    # inside quote(), "~" stands for a blank space between words;
    # google "mathematical annotation in R" for more math expressions
    labs(
        title = quote( Math~works~at~any~title~and~labels~math~such~as~x^2 + x),
        subtitle = 'pay attention to space and google "mathematical annotation in R" for more math expression',
        x = "high way",
        y = quote(sqrt(x)+log(x)),
        color = quote(frac(x,y))
    )

guides: use guide_legend() and guide_colorbar() function

# Three legends (shape, color bar, size), each tuned through guides().
ggplot(mpg, aes(factor(cyl), hwy, color = cty, shape = drv)) +
    geom_jitter(aes(size = displ), width = 0.2) +
    scale_color_gradient2(low = "red", high = "blue", midpoint = 22) +
    guides(
        # shape legend first; its keys are drawn large and grey
        shape = guide_legend(
            order = 1,
            title.position = "top",
            direction = "horizontal",
            override.aes = list(size = 10, color = "grey50")
        ),
        # continuous color bar: reversed, 3 cm wide, laid out horizontally
        color = guide_colorbar(
            title.position = "top",
            direction = "horizontal",
            barwidth = unit(3, "cm"),
            reverse = TRUE
        ),
        # size legend second: open circles arranged row-wise in two rows
        size = guide_legend(
            order = 2,
            title.position = "top",
            direction = "horizontal",
            nrow = 2,
            byrow = TRUE,
            override.aes = list(shape = 1)
        )
    ) +
    # dark panel, and no background fill behind the legend keys
    theme(
        panel.background = element_rect(fill = "grey10"),
        legend.key = element_rect(fill = NA)
    ) +
    # plot titles and the three legend titles in a single labs() call
    labs(
        title = "Control appearance of lengends",
        subtitle = "such as order of multiple legends, direction, arrangement, and title position of each legend",
        color = "city mileage",
        shape = "drive train",
        size = "displacement"
    )

legend: show and hide legend for multilayers

# A layer gets a legend only for mapped aesthetics. To force a legend where
# no data variable exists, map the aesthetic to a constant inside aes().
df <- data.frame(
    x = c(20, 30, 40),
    y = c(30, 20, 15),
    z = c("aa", "bb", "cc")
)
ggplot(mpg, aes(hwy, cty)) +
    # the color legend would appear by default; hide it
    geom_point(aes(color = drv), show.legend = FALSE) +
    # constant linetype "setline" mapped in aes() to obtain a line legend
    geom_line(data = df, aes(x, y, linetype = "setline")) +
    # shape rather than color, so it does not mix with the color aes above
    geom_point(data = df, aes(x, y, shape = z)) +
    scale_linetype_manual(
        values = c(setline = "solid"),
        labels = c("added layer")
    ) +
    labs(
        title = "Show and hide legends of multilayers",
        subtitle = "hide color legend that would appear by default\nforce line legend that does not appear by default using forced aes",
        shape = "additional points",
        linetype = "added line"
    )

annotate: create simple annotations from vector data

# annotate() builds a layer straight from vectors, without a data frame.
# geom_blank() only sets up the axes from mpg; everything visible below is
# an annotation.
ggplot(mpg, aes(hwy, cty)) +
    geom_blank() +
    # text: Inf places the label at the upper-right edge of the panel
    annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.1,
             label = "test Inf at\nthis place") +
    # text: a single label repeated at several positions
    annotate("text", x = c(20, 30, 40), y = c(10, 20, 15),
             label = "duplicated text at\nmultiple locations") +
    # text: rotated label parsed as a math expression
    annotate("text", x = 15, y = 40, angle = 30, parse = TRUE,
             label = "paste(italic(R) ^ 2, \" = 0.75\")") +
    # segment: four red arrows of increasing width
    annotate("segment",
             x = rep(20, 4), xend = rep(30, 4),
             y = c(45, 40, 35, 30), yend = c(40, 35, 30, 25),
             color = "red", size = c(0.5, 1, 1.5, 2),
             arrow = arrow(angle = 20, length = unit(5, "mm"),
                           type = "closed")) +
    # curve: curved arrow between the two red points added right after it
    annotate("curve", x = 15, xend = 18, y = 30, yend = 40, curvature = 0.9,
             arrow = arrow(angle = 20, length = unit(3, "mm"),
                           type = "closed")) +
    annotate("point", x = c(15, 18), y = c(30, 40), color = "red") +
    # rect: unfilled rectangle with a dotted blue outline
    annotate("rect", xmin = 15, xmax = 25, ymin = 15, ymax = 25,
             fill = NA, color = "blue", linetype = "dotted") +
    # pointrange: point at (x, y) with a vertical range from ymin to ymax
    annotate("pointrange", x = 12, y = 25, ymin = 10, ymax = 30) +
    # point + line: six large points in different colors, joined by a line
    annotate("point", x = 30:35, y = c(10, 11, 12, 11, 11, 10), size = 5,
             color = c("orange", "red", "blue", "green", "cyan", "black")) +
    annotate("line", x = 30:35, y = c(10, 11, 12, 11, 11, 10)) +
    # step + point: green step line with its defining points on top
    annotate("step", color = "green",
             x = c(35, 38, 40, 43, 45), y = c(30, 35, 33, 40, 41)) +
    annotate("point", color = "green",
             x = c(35, 38, 40, 43, 45), y = c(30, 35, 33, 40, 41)) +
    # path: points connected in the given order, ending in an arrow head
    annotate("path", x = c(35, 36, 37, 36, 35), y = c(20, 20.5, 22, 24, 27),
             arrow = arrow(angle = 30, length = unit(3, "mm"))) +
    # annotate() does not work with the hline and vline geoms:
    # the call below draws nothing, but it does not raise an error either
    annotate("hline", yintercept = 28) +
    labs(
        title = "Create simple geoms and annotations from vector data",
        subtitle = "complicated ones should be done with geom_xxx from data in dataframes"
    )

coord: control aspect ratio

# The same empty plot three times; only the `ratio` given to coord_fixed()
# differs between the panels.
g1 <- ggplot(mpg, aes(hwy, cty)) +
    geom_blank() +
    coord_fixed(ratio = 0.5)

g2 <- ggplot(mpg, aes(hwy, cty)) +
    geom_blank() +
    coord_fixed(ratio = 1)

g3 <- ggplot(mpg, aes(hwy, cty)) +
    geom_blank() +
    coord_fixed(ratio = 1.5)

grid.arrange(
    g1, g2, g3,
    nrow = 1,
    top = textGrob("Use coord_fixed to control aspect ratio")
)

coord: zoom in

# Two ways of zooming in on a plot that has a fitted line.

# reference plot: all data plus a linear fit
g1 <- ggplot(mpg, aes(hwy, cty)) +
    geom_point() +
    stat_smooth(method = "lm") +
    labs(subtitle = "full range")

# zoom through the coordinate system: data outside the limits is kept
g2 <- g1 +
    coord_cartesian(xlim = c(20, 30), ylim = c(15, 25)) +
    labs(subtitle = "coord_catersian(xlim=, ylim=) keep data outside limits")

# zoom through scale limits: data outside the limits is lost
g3 <- g1 +
    xlim(20, 30) +
    ylim(15, 25) +
    labs(subtitle = "lose data outside limits with xlim()")

grid.arrange(
    g1, g2, g3,
    ncol = 1,
    top = textGrob("Keep or lose data outside of limits when zoon in")
)

coord: pie chart, bullseye chart

# One stacked bar of cylinder counts, shown in three coordinate systems.
base <- ggplot(mtcars, aes(factor(1), fill = factor(cyl))) +
    stat_count(width = 1)

# cartesian coordinates: stacked bar chart
g1 <- base
# polar coordinates with the angle mapped to y: pie chart
g2 <- base + coord_polar(theta = "y")
# polar coordinates with default settings: bullseye chart
g3 <- base + coord_polar()

grid.arrange(
    g1, g2, g3,
    nrow = 1,
    top = textGrob("Pie chart and bulleye chart")
)

theme: theme_update() theme_set()

theme_update() is a slightly unusual function because it does two things at once: first, it changes the current theme as specified by its arguments; second, it returns the old theme as it was before the update, so that it can be restored later with theme_set().

# start from the default theme
theme_set(theme_grey())

# theme_update() changes the current theme and hands back the previous one,
# which is stored here so that it can be restored afterwards
old_theme <- theme_update(
    axis.title = element_text(color = "linen"),
    axis.text = element_text(color = "linen"),
    panel.background = element_rect(fill = "lightblue", color = NA),
    plot.background = element_rect(fill = "lightblue3", color = NA)
)

df <- data.frame(x = 1:3, y = 1:3)
base <- ggplot(df, aes(x, y)) +
    geom_point()

# drawn while the updated theme is active
base + labs(title = "updated theme")

# restore the stored theme and draw the same plot again
theme_set(old_theme)
base + labs(title = "old theme")

tricks: use the cut_width() function to categorize continuous variables

# ggplot2's cut_*() helpers turn a continuous variable into groups.
df <- data.frame(x = seq_len(10000), y = rnorm(10000))

# cut_width(): bins of width 2000 along x, with a bin boundary at 0
g1 <- ggplot(df, aes(x, y)) +
    geom_boxplot(aes(group = cut_width(x, 2000, boundary = 0))) +
    labs(subtitle = "cut width of 2000 along x starting from left limit")

# cut_number(): 5 groups of y holding about the same number of points
g2 <- ggplot(df, aes(x, y)) +
    geom_boxplot(aes(color = cut_number(y, 5))) +
    labs(subtitle = "cut y into 5 groups with about equal number")

# cut_interval(): 5 intervals of y of equal width
g3 <- ggplot(df, aes(x, y)) +
    geom_boxplot(aes(fill = cut_interval(y, 5))) +
    labs(subtitle = "cut y into 5 intervals of equal width")

grid.arrange(
    g1, g2, g3,
    nrow = 1,
    top = textGrob(
        "Use ggplot2's cut_xxx() to categorize contineous variable",
        gp = gpar(fontsize = 16)
    )
)

Skipped for now; the following topics will be expanded when they are needed:

  • $6.6.1 date and time
  • $6.6.2 color